# download_rlj_issue.py
# RLJ (Record and Library Journal) Downloader
# Automates downloading article PDFs from the RLJ archive (OJS-based)
# - Prompts for issue URL
# - Parses Table of Contents while skipping Front Matter and Back Matter
# - For each article, extracts viewer link and then locates true PDF download URL
# - Handles multi-step download chain (article → viewer → actual file)
# - Creates dynamic folder based on Volume/Issue/Year from <title>
# - Sanitizes filenames for safe saving
# - Logs results (title, article URL, PDF URL, status) into a CSV file

import os
import re
import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Prompt for issue URL
issue_url = input("Enter RLJ issue URL: ").strip()
if not issue_url:
    # Fail with a clear message instead of an opaque requests schema error.
    raise SystemExit("No issue URL provided.")

# Site root used to resolve relative links found on the journal pages.
base_url = "https://e-journal.unair.ac.id"

# Fetch issue page. The timeout keeps a stalled server from hanging the
# script indefinitely (requests waits forever by default).
resp = requests.get(issue_url, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, "html.parser")

# Extract volume, issue, year from the page <title>. A page without a <title>
# tag yields an empty string, so the placeholder names below are used.
title_tag = soup.find("title")
title_text = title_tag.text.strip() if title_tag else ""
vol_match = re.search(r"Vol\.\s*(\d+)", title_text)
iss_match = re.search(r"No\.\s*(\d+)", title_text)
yr_match = re.search(r"\((\d{4})\)", title_text)

# Fall back to literal placeholders when a component is missing from the title.
vol = vol_match.group(1) if vol_match else "Vol"
iss = iss_match.group(1) if iss_match else "Issue"
yr = yr_match.group(1) if yr_match else "Year"

# Output folder is named after the issue, e.g. RLJ_Vol5_Issue2_2019.
folder = f"RLJ_Vol{vol}_Issue{iss}_{yr}"
os.makedirs(folder, exist_ok=True)

# Setup CSV log
csv_path = os.path.join(folder, f"{folder}_log.csv")

# Seconds to wait on each HTTP request; requests has no timeout by default,
# so a single stalled connection would otherwise hang the whole run.
REQUEST_TIMEOUT = 30
# Cap on the title part of a file name, to stay under filesystem name limits.
MAX_TITLE_LENGTH = 200

# Lower-cased file names already written in this run, so two articles with
# the same (sanitized) title do not overwrite each other.
used_names = set()
count = 1

# The context manager guarantees the log is flushed and closed even if the
# run is interrupted or an unexpected error escapes the loop.
with open(csv_path, "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["Title", "Article URL", "PDF URL", "Status"])

    # Parse article sections
    for sec in soup.find_all("div", class_="section"):
        header = sec.find("h2")
        if header and any(x in header.text for x in ["Front Matter", "Back Matter"]):
            print(f"[SKIP SECTION] {header.text.strip()}")
            continue

        for article in sec.find_all("h3", class_="title"):
            title = article.get_text(strip=True)
            # Initialised up front so the error row can record whatever was
            # resolved before the failure.
            article_url = ""
            pdf_url = ""
            print(f"[{count}] Downloading: {title}")

            try:
                # A heading without a link is logged as a failed article
                # rather than crashing the whole run.
                link = article.find("a", href=True)
                if not link:
                    raise Exception("No article link in table of contents")
                article_url = urljoin(base_url, link["href"])

                # Visit article page to get viewer link
                art_page = requests.get(article_url, timeout=REQUEST_TIMEOUT)
                art_page.raise_for_status()
                art_soup = BeautifulSoup(art_page.text, "html.parser")
                viewer_a = art_soup.find("a", class_="obj_galley_link pdf")
                if not viewer_a:
                    raise Exception("No viewer link on article page")
                viewer_url = urljoin(base_url, viewer_a["href"])

                # Visit viewer page to get final download link
                view_page = requests.get(viewer_url, timeout=REQUEST_TIMEOUT)
                view_page.raise_for_status()
                view_soup = BeautifulSoup(view_page.text, "html.parser")
                download_a = view_soup.find(
                    "a", href=re.compile(r"/article/download/\d+/\d+/\d+")
                )
                if not download_a:
                    raise Exception("Download link not found")

                pdf_url = urljoin(base_url, download_a["href"])

                # Download first, so nothing is written to disk on failure.
                pdf_resp = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
                pdf_resp.raise_for_status()
                content_type = pdf_resp.headers.get("Content-Type", "")
                if "application/pdf" not in content_type:
                    raise Exception(
                        f"Not a PDF file (content-type={pdf_resp.headers.get('Content-Type')})"
                    )

                # Build a safe file name: collapse whitespace (titles may
                # contain newlines), drop characters that are invalid on
                # common filesystems, and cap the length.
                safe_title = re.sub(r"\s+", " ", title)
                safe_title = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", safe_title).strip()
                safe_title = safe_title[:MAX_TITLE_LENGTH].rstrip(" .")
                if not safe_title:
                    # Title was empty or consisted only of stripped characters.
                    safe_title = f"article_{count}"
                if safe_title.lower() in used_names:
                    # Same title seen earlier in this run: disambiguate
                    # instead of silently overwriting the earlier PDF.
                    safe_title = f"{safe_title} ({count})"

                # Save PDF
                filename = os.path.join(folder, f"{safe_title}.pdf")
                with open(filename, "wb") as f:
                    f.write(pdf_resp.content)
                used_names.add(safe_title.lower())

                print(f"[SAVED] {safe_title}.pdf")
                csv_writer.writerow([title, article_url, pdf_url, "Downloaded"])

            except Exception as e:
                # Per-article boundary: record the failure and move on to
                # the next article.
                print(f"[ERROR] Failed: {title} - {e}")
                csv_writer.writerow([title, article_url, pdf_url, f"Error: {e}"])

            count += 1

print(f"\nDone! Log file saved to: {csv_path}")
